// Geometric Tools, LLC
// Copyright (c) 1998-2014
// Distributed under the Boost Software License, Version 1.0.
// http://www.boost.org/LICENSE_1_0.txt
// http://www.geometrictools.com/License/Boost/LICENSE_1_0.txt
//
// File Version: 5.12.0 (2014/07/02)

// NOTE: This code was written for the upcoming Geometric Tools Engine but
// has been back-ported to Wild Magic 5 to replace its badly implemented
// version.

#ifndef WM5SINGULARVALUEDECOMPOSITIONGTE_H
#define WM5SINGULARVALUEDECOMPOSITIONGTE_H

#include "Wm5MathematicsLIB.h"

// The SingularValueDecomposition class is an implementation of Algorithm
// 8.3.2 (The SVD Algorithm) described in "Matrix Computations, 2nd
// edition" by G. H. Golub and Charles F. Van Loan, The Johns Hopkins
// Press, Baltimore MD, Fourth Printing 1993.  Algorithm 5.4.2 (Householder
// Bidiagonalization) is used to reduce A to bidiagonal B.  Algorithm 8.3.1
// (Golub-Kahan SVD Step) is used for the iterative reduction from bidiagonal
// to diagonal.  If A is the original matrix, S is the matrix whose diagonal
// entries are the singular values, and U and V are corresponding matrices,
// then theoretically U^T*A*V = S.  Numerically, we have errors
// E = U^T*A*V - S.  Algorithm 8.3.2 mentions that one expects |E| is
// approximately u*|A|, where |M| denotes the Frobenius norm of M and where
// u is the unit roundoff for the floating-point arithmetic: 2^{-23} for
// 'float', which is FLT_EPSILON = 1.192092896e-7f, and 2^{-52} for'double',
// which is DBL_EPSILON = 2.2204460492503131e-16.
//
// The condition |a(i,i+1)| <= epsilon*(|a(i,i) + a(i+1,i+1)|) used to
// determine when the reduction decouples to smaller problems is implemented
// as:  sum = |a(i,i)| + |a(i+1,i+1)|; sum + |a(i,i+1)| == sum.  The idea is
// that the superdiagonal term is small relative to its diagonal neighbors,
// and so it is effectively zero.  The unit tests have shown that this
// interpretation of decoupling is effective.
//
// The condition |a(i,i)| <= epsilon*|B| used to determine when the
// reduction decouples (with a zero singular value) is implemented using
// the Frobenius norm of B and epsilon = multiplier*u, where for now the
// multiplier is hard-coded in Solve(...) as 8.
//
// The authors suggest that once you have the bidiagonal matrix, a practical
// implementation will store the diagonal and superdiagonal entries in linear
// arrays, ignoring the theoretically zero values not in the 2-band.  This is
// good for cache coherence, and we have used the suggestion.  The essential
// parts of the Householder u-vectors are stored in the lower-triangular
// portion of the matrix and the essential parts of the Householder v-vectors
// are stored in the upper-triangular portion of the matrix.  To avoid having
// to recompute 2/Dot(u,u) and 2/Dot(v,v) when constructing orthogonal U and
// V, we store these quantities in additional memory during bidiagonalization.
//
// For matrices with randomly generated values in [0,1], the unit tests
// produce the following information for N-by-N matrices.
//
// N  |A|     |E|        |E|/|A|    iterations
// -------------------------------------------
//  2  1.4831 4.1540e-16 2.8007e-16  1
//  3  2.1065 3.5024e-16 1.6626e-16  4
//  4  2.4979 7.4605e-16 2.9867e-16  6
//  5  3.6591 1.8305e-15 5.0025e-16  9
//  6  4.0572 2.0571e-15 5.0702e-16 10
//  7  4.7745 2.9057e-15 6.0859e-16 12
//  8  5.1964 2.7958e-15 5.3803e-16 13
//  9  5.7599 3.3128e-15 5.7514e-16 16
// 10  6.2700 3.7209e-15 5.9344e-16 16
// 11  6.8220 5.0580e-15 7.4142e-16 18
// 12  7.4540 5.2493e-15 7.0422e-16 21
// 13  8.1225 5.6043e-15 6.8997e-16 24
// 14  8.5883 5.8553e-15 6.8177e-16 26
// 15  9.1337 6.9663e-15 7.6270e-16 27
// 16  9.7884 9.1358e-15 9.3333e-16 29
// 17 10.2407 8.2715e-15 8.0771e-16 34
// 18 10.7147 8.9748e-15 8.3761e-16 33
// 19 11.1887 1.0094e-14 9.0220e-16 32
// 20 11.7739 9.7000e-15 8.2386e-16 35
// 21 12.2822 1.1217e-14 9.1329e-16 36
// 22 12.7649 1.1071e-14 8.6732e-16 37
// 23 13.3366 1.1271e-14 8.4513e-16 41
// 24 13.8505 1.2806e-14 9.2460e-16 43
// 25 14.4332 1.3081e-14 9.0637e-16 43
// 26 14.9301 1.4882e-14 9.9680e-16 46
// 27 15.5214 1.5571e-14 1.0032e-15 48
// 28 16.1029 1.7553e-14 1.0900e-15 49
// 29 16.6407 1.6219e-14 9.7467e-16 53
// 30 17.1891 1.8560e-14 1.0797e-15 55
// 31 17.7773 1.8522e-14 1.0419e-15 56
//
// The singularvalues and |E|/|A| values were compared to those generated by
// Mathematica Version 9.0; Wolfram Research, Inc., Champaign IL, 2012.
// The results were all comparable with singular values agreeing to a large
// number of decimal places.
//
// Timing on an Intel (R) Core (TM) i7-3930K CPU @ 3.20 GHZ (in seconds)
// for NxN matrices:
//
// N    |E|/|A|    iters bidiag  QR     U-and-V    comperr
// -------------------------------------------------------
//  512 3.8632e-15  848   0.341  0.016    1.844      2.203
// 1024 5.6456e-15 1654   4.279  0.032   18.765     20.844 
// 2048 7.5499e-15 3250  40.421  0.141  186.607    213.216
//
// where iters is the number of QR steps taken, bidiag is the computation
// of the Householder reflection vectors, U-and-V is the composition of
// Householder reflections and Givens rotations to obtain the orthogonal
// matrices of the decomposigion, and comperr is the computation E =
// U^T*A*V - S.

namespace Wm5
{

template <typename Real>
class WM5_MATHEMATICS_ITEM SingularValueDecompositionGTE
{
public:
    // The solver processes MxN symmetric matrices, where M >= N > 1
    // ('numRows' is M and 'numCols' is N) and the matrix is stored in
    // row-major order.  The maximum number of iterations ('maxIterations')
    // must be specified for the reduction of a bidiagonal matrix to a
    // diagonal matrix.  The goal is to compute MxM orthogonal U, NxN
    // orthogonal V, and MxN matrix S for which U^T*A*V = S.  The only
    // nonzero entries of S are on the diagonal; the diagonal entries are
    // the singular values of the original matrix.
    SingularValueDecompositionGTE(int numRows, int numCols,
        unsigned int maxIterations);

    // A copy of the MxN input is made internally.  The order of the singular
    // values is specified by sortType: -1 (decreasing), 0 (no sorting), or +1
    // (increasing).  When sorted, the columns of the orthogonal matrices
    // are ordered accordingly.  The return value is the number of iterations
    // consumed when convergence occurred, 0xFFFFFFFF when convergence did not
    // occur or 0 when N <= 1 or M < N was passed to the constructor.
    unsigned int Solve(Real const* input, int sortType);

    // Get the singular values of the matrix passed to Solve(...).  The input
    // 'singularValues' must have N elements.
    void GetSingularValues(Real* singularValues) const;

    // Accumulate the Householder reflections, the Givens rotations, and the
    // diagonal fix-up matrix to compute the orthogonal matrices U and V for
    // which U^T*A*V = S.  The input uMatrix must be MxM and the input vMatrix
    // must be NxN, both stored in row-major order.
    void GetU(Real* uMatrix) const;
    void GetV(Real* vMatrix) const;

private:
    // Bidiagonalize using Householder reflections.  On input, mMatrix is a
    // copy of the input matrix and has one extra row.  On output, the
    // diagonal and superdiagonal contain the bidiagonalized results.  The
    // lower-triangular portion stores the essential parts of the Householder
    // u vectors (the elements of u after the leading 1-valued component) and
    // the upper-triangular portion stores the essential parts of the
    // Householder v vectors.  To avoid recomputing 2/Dot(u,u) and 2/Dot(v,v),
    // these quantities are stored in mTwoInvUTU and mTwoInvVTV.
    void Bidiagonalize();

    // A helper for generating Givens rotation sine and cosine robustly.
    void GetSinCos(Real u, Real v, Real& cs, Real& sn);

    // Test for (effectively) zero-valued diagonal entries (through all but
    // the last).  For each such entry, the B matrix decouples.  Perform
    // that decoupling.  If there are no zero-valued entries, then the
    // Golub-Kahan step must be performed.
    bool DiagonalEntriesNonzero(int imin, int imax, Real threshold);

    // This is Algorithm 8.3.1 in "Matrix Computations, 2nd edition" by
    // G. H. Golub and C. F. Van Loan.
    void DoGolubKahanStep(int imin, int imax);

    // The diagonal entries are not guaranteed to be nonnegative during the
    // construction.  After convergence to a diagonal matrix S, test for
    // negative entries and build a diagonal matrix that reverses the sign
    // on the S-entry.
    void EnsureNonnegativeDiagonal();

    // Sort the singular values and compute the corresponding permutation of
    // the indices of the array storing the singular values.  The permutation
    // is used for reordering the singular values and the corresponding
    // columns of the orthogonal matrix in the calls to GetSingularValues(...)
    // and GetOrthogonalMatrices(...).
    void ComputePermutation(int sortType);

    // The number rows and columns of the matrices to be processed.
    int mNumRows, mNumCols;

    // The maximum number of iterations for reducing the bidiagonal matrix
    // to a diagonal matrix.
    unsigned int mMaxIterations;

    // The internal copy of a matrix passed to the solver.  See the comments
    // about function Bidiagonalize() about what is stored in the matrix.
    std::vector<Real> mMatrix;  // MxN elements

    // After the initial bidiagonalization by Householder reflections, we no
    // longer need the full mMatrix.  Copy the diagonal and superdiagonal
    // entries to linear arrays in order to be cache friendly.
    std::vector<Real> mDiagonal;  // N elements
    std::vector<Real> mSuperdiagonal;  // N-1 elements

    // The Givens rotations used to reduce the initial bidiagonal matrix to
    // a diagonal matrix.  A rotation is the identity with the following
    // replacement entries:  R(index0,index0) = cs, R(index0,index1) = sn,
    // R(index1,index0) = -sn, and R(index1,index1) = cs.  If N is the
    // number of matrix columns and K is the maximum number of iterations, the
    // maximum number of right or left Givens rotations is K*(N-1).  The
    // maximum amount of memory is allocated to store these.  However, we also
    // potentially need left rotations to decouple the matrix when a diagonal
    // terms are zero.  Worst case is a number of matrices quadratic in N, so
    // for now we just use std::vector<Rotation> whose initial capacity is
    // K*(N-1).
    struct WM5_MATHEMATICS_ITEM GivensRotation
    {
        GivensRotation();
        GivensRotation(int inIndex0, int inIndex1, Real inCs, Real inSn);
        int index0, index1;
        Real cs, sn;
    };

    std::vector<GivensRotation> mRGivens;
    std::vector<GivensRotation> mLGivens;

    // The diagonal matrix that is used to convert S-entries to nonnegative.
    std::vector<Real> mFixupDiagonal;  // N elements

    // When sorting is requested, the permutation associated with the sort is
    // stored in mPermutation.  When sorting is not requested, mPermutation[0]
    // is set to -1.  mVisited is used for finding cycles in the permutation.
    struct SortItem
    {
        Real singularValue;
        int index;

        bool operator<(SortItem const& item) const
        {
            return singularValue < item.singularValue;
        }

        bool operator>(SortItem const& item) const
        {
            return singularValue > item.singularValue;
        }
    };
    std::vector<int> mPermutation;  // N elements
    mutable std::vector<int> mVisited;  // N elements

    // Temporary storage to compute Householder reflections and to support
    // sorting of columns of the orthogonal matrices.
    std::vector<Real> mTwoInvUTU;  // N elements
    std::vector<Real> mTwoInvVTV;  // N-2 elements
    mutable std::vector<Real> mUVector;  // M elements
    mutable std::vector<Real> mVVector;  // N elements
    mutable std::vector<Real> mWVector;  // max(M,N) elements
};

typedef SingularValueDecompositionGTE<float> SingularValueDecompositionGTEf;
typedef SingularValueDecompositionGTE<double> SingularValueDecompositionGTEd;

}

#endif
